{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第三题：支持向量机的分类任务\n",
    "\n",
    "实验内容：\n",
    "1. 使用支持向量机完成spambase垃圾邮件分类任务\n",
    "2. 使用训练集训练模型，计算测试集的精度，查准率，查全率，F1值"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 请你使用SVC，完成spambase分类任务\n",
    "\n",
    "要求：使用全部特征，完成以下内容的填写\n",
    "\n",
    "###### 双击此处填写\n",
    "\n",
    "核函数 | C | 精度 | 查准率 | 查全率 | F1\n",
    "- | - | - | - | - | -\n",
    "rbf | 0.1 | 0.0 | 0.0 | 0.0 | 0.0\n",
    "rbf | 1 | 0.0 | 0.0 | 0.0 | 0.0\n",
    "linear | 0.1 | 0.0 | 0.0 | 0.0 | 0.0\n",
    "linear | 1 | 0.0 | 0.0 | 0.0 | 0.0\n",
    "sigmoid | 0.1 | 0.0 | 0.0 | 0.0 | 0.0\n",
    "sigmoid | 1 | 0.0 | 0.0 | 0.0 | 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入数据\n",
    "import numpy as np\n",
    "data = np.loadtxt('data/spambase/spambase.data', delimiter = \",\")\n",
    "spamx = data[:, :57]\n",
    "spamy = data[:, 57]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((3220, 57), (3220,), (1381, 57), (1381,))"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 数据集分割\n",
    "from sklearn.model_selection import train_test_split\n",
    "trainX, testX, trainY, testY = train_test_split(spamx, spamy, test_size = 0.3, random_state = 32)\n",
    "trainX.shape, trainY.shape, testX.shape, testY.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**注意：计算线性核的时候，要使用 LinearSVC 这个类，不要使用SVC(kernel = 'linear')。LinearSVC不需要设置kernel参数！**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 引入模型\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import precision_score\n",
    "from sklearn.metrics import recall_score\n",
    "from sklearn.metrics import f1_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rbf：C=0.1\n",
      "精度： 0.7 查准率： 0.65 查全率： 0.46 f1： 0.54\n"
     ]
    }
   ],
   "source": [
    "clf_r0=SVC(kernel='rbf',C=0.1)\n",
    "clf_r0.fit(trainX,trainY)\n",
    "prediction=clf_r0.predict(testX)\n",
    "as_spam_r=round(accuracy_score(testY,prediction),2)\n",
    "ps_spam_r=round(precision_score(testY,prediction),2)\n",
    "rs_spam_r=round(recall_score(testY,prediction),2)\n",
    "f1_spam_r=round(f1_score(testY,prediction),2)\n",
    "print('rbf：C=0.1')\n",
    "print('精度：',as_spam_r,'查准率：',ps_spam_r,'查全率：',rs_spam_r,'f1：',f1_spam_r)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rbf：C=1\n",
      "精度： 0.72 查准率： 0.67 查全率： 0.49 f1： 0.57\n"
     ]
    }
   ],
   "source": [
    "clf_r0=SVC(kernel='rbf',C=1)\n",
    "clf_r0.fit(trainX,trainY)\n",
    "prediction=clf_r0.predict(testX)\n",
    "as_spam_r=round(accuracy_score(testY,prediction),2)\n",
    "ps_spam_r=round(precision_score(testY,prediction),2)\n",
    "rs_spam_r=round(recall_score(testY,prediction),2)\n",
    "f1_spam_r=round(f1_score(testY,prediction),2)\n",
    "print('rbf：C=1')\n",
    "print('精度：',as_spam_r,'查准率：',ps_spam_r,'查全率：',rs_spam_r,'f1：',f1_spam_r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "线性：C=0.1\n",
      "精度： 0.85 查准率： 0.75 查全率： 0.92 f1： 0.82\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\anaconda3\\lib\\site-packages\\sklearn\\svm\\_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  warnings.warn(\"Liblinear failed to converge, increase \"\n"
     ]
    }
   ],
   "source": [
    "clf=LinearSVC(C=0.1)\n",
    "clf.fit(trainX,trainY)\n",
    "LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,\n",
    "          intercept_scaling=1, loss='squared_hinge', max_iter=100000,\n",
    "          multi_class='ovr', penalty='l2', random_state=32, tol=0.0001,\n",
    "          verbose=0)\n",
    "prediction=clf.predict(testX)\n",
    "as_spam_l=round(accuracy_score(testY,prediction),2)\n",
    "ps_spam_l=round(precision_score(testY,prediction),2)\n",
    "rs_spam_l=round(recall_score(testY,prediction),2)\n",
    "f1_spam_l=round(f1_score(testY,prediction),2)\n",
    "print('线性：C=0.1')\n",
    "print('精度：',as_spam_l,'查准率：',ps_spam_l,'查全率：',rs_spam_l,'f1：',f1_spam_l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "线性：C=1\n",
      "精度： 0.84 查准率： 0.73 查全率： 0.91 f1： 0.81\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\anaconda3\\lib\\site-packages\\sklearn\\svm\\_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  warnings.warn(\"Liblinear failed to converge, increase \"\n"
     ]
    }
   ],
   "source": [
    "clf=LinearSVC(C=1)\n",
    "clf.fit(trainX,trainY)\n",
    "prediction=clf.predict(testX)\n",
    "as_spam_l=round(accuracy_score(testY,prediction),2)\n",
    "ps_spam_l=round(precision_score(testY,prediction),2)\n",
    "rs_spam_l=round(recall_score(testY,prediction),2)\n",
    "f1_spam_l=round(f1_score(testY,prediction),2)\n",
    "print('线性：C=1')\n",
    "print('精度：',as_spam_l,'查准率：',ps_spam_l,'查全率：',rs_spam_l,'f1：',f1_spam_l)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf_s0=SVC(kernel='sigmoid',C=0.1,random_state=32)\n",
    "clf_s0.fit(trainX,trainY)\n",
    "prediction=clf_s0.predict(testX)\n",
    "as_spam_s=round(accuracy_score(testY,prediction),2)\n",
    "ps_spam_s=round(precision_score(testY,prediction),2)\n",
    "rs_spam_s=round(recall_score(testY,prediction),2)\n",
    "f1_spam_s=round(f1_score(testY,prediction),2)\n",
    "print('sigmoid：C=0.1')\n",
    "print('精度：',as_spam_s,'查准率：',ps_spam_s,'查全率：',rs_spam_s,'f1：',f1_spam_s)\n",
    "\n",
    "clf_s1=SVC(kernel='sigmoid',C=1,random_state=32)\n",
    "clf_s1.fit(trainX,trainY)\n",
    "prediction=clf_s1.predict(testX)\n",
    "as_spam_s=round(accuracy_score(testY,prediction),2)\n",
    "ps_spam_s=round(precision_score(testY,prediction),2)\n",
    "rs_spam_s=round(recall_score(testY,prediction),2)\n",
    "f1_spam_s=round(f1_score(testY,prediction),2)\n",
    "print('sigmoid：C=1')\n",
    "print('精度：',as_spam_s,'查准率：',ps_spam_s,'查全率：',rs_spam_s,'f1：',f1_spam_s)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf_s0=SVC(kernel='sigmoid',C=0.1,random_state=32)\n",
    "clf_s0.fit(trainX,trainY)\n",
    "prediction=clf_s0.predict(testX)\n",
    "as_spam_s=round(accuracy_score(testY,prediction),2)\n",
    "ps_spam_s=round(precision_score(testY,prediction),2)\n",
    "rs_spam_s=round(recall_score(testY,prediction),2)\n",
    "f1_spam_s=round(f1_score(testY,prediction),2)\n",
    "print('sigmoid：C=0.1')\n",
    "print('精度：',as_spam_s,'查准率：',ps_spam_s,'查全率：',rs_spam_s,'f1：',f1_spam_s)\n",
    "\n",
    "clf_s1=SVC(kernel='sigmoid',C=1,random_state=32)\n",
    "clf_s1.fit(trainX,trainY)\n",
    "prediction=clf_s1.predict(testX)\n",
    "as_spam_s=round(accuracy_score(testY,prediction),2)\n",
    "ps_spam_s=round(precision_score(testY,prediction),2)\n",
    "rs_spam_s=round(recall_score(testY,prediction),2)\n",
    "f1_spam_s=round(f1_score(testY,prediction),2)\n",
    "print('sigmoid：C=1')\n",
    "print('精度：',as_spam_s,'查准率：',ps_spam_s,'查全率：',rs_spam_s,'f1：',f1_spam_s)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 选做：比较LinearSVC和SVC(kernel = 'linear')的运行时间"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# YOUR CODE HERE\n",
    "import time\n",
    "\n",
    "\n",
    "start = time.time()\n",
    "clf=LinearSVC(C=0.1,random_state=32)\n",
    "clf.fit(trainX,trainY)\n",
    "LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,\n",
    "          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
    "          multi_class='ovr', penalty='l2', random_state=32, tol=0.0001,\n",
    "          verbose=0)\n",
    "prediction=clf.predict(testX)\n",
    "end = time.time()\n",
    "print(\"LinearSVC运行时间\",start-end)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
